
#include "NeuralNet.hpp"

// STL
#include <algorithm> // std::min, std::max
#include <cmath>
#include <cstdlib>   // exit()
#include <iostream>
#include <vector>

// LAPACK/BLAS
#if defined(WLAPACKLINUX)
#include <lapacke/lapacke.h>
#elif defined(WLAPACKMAC)
#include <Accelerate/Accelerate.h>
#endif

// jSCIENCE
#include "jstats.hpp" // sum()

// This
#include "datasets.hpp" // get_TS_stats()
#include "wrappers.hpp" // correlation_matrix()


//========================================================================
//========================================================================
//
// NAME: void NEURAL_NET::preprocess(DataSet TS, const double S_tol)
//
// DESC:  Given a training set, perform pre-processing (necessary to transform any future data). S_tol is used for data reduction with PCA.
//
// NOTES: See also preprocess_PCA() below.
//
//========================================================================
//========================================================================
void NEURAL_NET::preprocess(DataSet TS, const double S_tol)
{
    // Collect per-dimension statistics (min/max/mean/variance/std. dev.) for
    // both inputs and outputs of the training set; these are stored in the
    // m_inp_*/m_out_* members so that any future data can be transformed
    // the same way.
    get_TS_stats(TS, m_inp_min, m_inp_max, m_inp_mean, m_inp_var, m_inp_stddev, m_out_min, m_out_max, m_out_mean, m_out_var, m_out_stddev);
    
    // Apply the selected input transformation:
    //   0 -> none
    //   2 -> PCA de-correlation / dimension reduction (S_tol = fraction of
    //        singular-value mass to keep; see preprocess_PCA() below)
    //   (a kernel-LDA mode once existed as method 3 but is currently disabled)
    if( m_transf_method == 2 )
    {
        this->preprocess_PCA(TS, S_tol);
    }
    
    // If the transformation reduced the dimensionality, shrink the input layer.
    if( m_transf_ninp != m_ninp )
    {
        this->reduce_input();
    }
    
    this->create_lookup_tables();

    m_isTrained = true;
    
    this->get_layer_neuron_idx();
}


//========================================================================
//========================================================================
//
// NAME: void NEURAL_NET::preprocess_PCA(DataSet ds, const double S_tol)
//
// DESC: Given a training set, perform pre-processing using PCA de-correlation and dimension reduction. S_tol is the fraction of principal components to keep. Common values for S_tol are 90%, 95%, 99%, and 99.9% -- 100% means no data reduction.
//
// NOTES:
//     ! the decorrelation process is actually simple:
//     ! first, recall that the SVD of a matrix is written as:
//          M = U*S*V^T
//          where:
//                 U & V are orthonormal and S is diagonal with the singular values
//     ! if we now right or left multiply by M^T (e.g., as in the covariance matrix M*M^T = M*M^T):
//          M*M^T = U*S^2*U^T
//          M^T*M = V*S^2*V^T
//     ! now also recall that for a symmetric matrix A we can write:
//          A = X*L*X^T
//          where:
//                 X == eigenvectors of A in the columns
//                 L == diagonal matrix with eigenvalues
//     ! putting this together, we can calculate the SVD of M, and extract the eigenvectors and eigenvalues of M*M^T as the columns of V and the diagonal elements of S^2
//     ! now, to decorrelate, what we want is a transformation matrix W such that:
//          W*(M*M^T)*W^T = I
//          where:
//                 I == the identity matrix (no cross correlation in the transformed data)
//     ! so:
//          W = D*V^T
//          where:
//                 D = L^(-1/2)
//     ! see:
//          http://www.netlib.no/netlib/lapack/double/dgesvd.f
//     ! as in collecting TRAINING_SET stats in get_TS_stats(), only the 0 time-delay position in TRAINING_STAT is used, since temporal positions are likely to be rather similar
//
//========================================================================
//========================================================================
void NEURAL_NET::preprocess_PCA(DataSet ds, const double S_tol)
{
    //=========================================================
    // STEP 1. GET THE *SAMPLE* CORRELATION/COVARIANCE MATRIX C
    //=========================================================
    // ! the input is assumed to already be scaled to mean 0 / std. dev. 1
    //   (statistics come from get_TS_stats() in preprocess()); the manual
    //   scaling + covariance accumulation that used to live here has been
    //   replaced by the correlation_matrix() wrapper
    // ! C is [m_ninp x m_ninp], symmetric positive semi-definite
    
    std::vector<std::vector<double>> C = correlation_matrix(ds);

    //=========================================================
    // STEP 2. PERFORM SVD ON C TO GET EIGENVALUES & EIGENVECTORS
    //=========================================================
    // ! IMPORTANT: all values must be passed by address in LAPACK, *NOT* by value
    // ! IMPORTANT: matrices are stored column-major in LAPACK, *NOT* row-major
    // ! V^T is returned here (as opposed to my SVD implementation which returns V)
    
    // SET TO COMPUTE NOT U BUT V^T
    char JOBU  = 'N';
    char JOBVT = 'A';
    // SET ROWS (M) & COLUMNS (N) OF A
    int M = m_ninp;
    int N = m_ninp;
    // SET THE LEADING DIMENSION FOR A AND STORE THE MATRIX A == C *IN COLUMN-MAJOR FORMAT*
    int LDA = M;
    std::vector<double> A;
    A.reserve(static_cast<std::size_t>(M)*static_cast<std::size_t>(N));
    for(int j = 0; j < N; ++j)
    {
        for(int i = 0; i < M; ++i)
        {
            A.push_back(C[i][j]);
        }
    }
    // SET UP LEADING DIMENSIONS FOR U & V^T AND ALLOCATE ARRAYS TO STORE S, U, & V^T
    // ! with JOBU == 'N', dgesvd never references U, but LDU must still be >= 1;
    //   a one-element placeholder gives us a valid pointer to pass
    //   (the old code did &*U.begin() on an EMPTY vector -- that dereferences
    //   the end iterator, which is undefined behavior)
    int LDU  = M;
    int LDVT = N;
    std::vector<double> S( std::min(M,N) );
    std::vector<double> U( 1 );
    std::vector<double> VT( static_cast<std::size_t>(LDVT)*static_cast<std::size_t>(N) );
    // SET DIMENSION OF WORK ARRAY & SETUP WORK ARRAY (minimum per dgesvd docs)
    int LWORK = std::max( 1, std::max( (3*std::min(M,N) + std::max(M,N)), 5*std::min(M,N) ) );
    std::vector<double> WORK( std::max(1, LWORK) );
    // SETUP AN INFO FLAG
    int INFO = 0;
    
    // CALL DGESVD
    dgesvd_( &JOBU, &JOBVT, &M, &N, A.data(), &LDA, S.data(), U.data(), &LDU, VT.data(), &LDVT, WORK.data(), &LWORK, &INFO );
    
    // CHECK THAT RETURN WAS NORMAL
    if( INFO != 0 )
    {
        std::cout << "Error in NEURAL_NET::preprocess_PCA: INFO from dgesvd() did not return as 0!" << std::endl;
        exit(1); // was exit(0), which reports success to the caller on a failure path
    }
    
    //=========================================================
    // STEP 3. CALCULATE THE DE-CORRELATION MATRIX W = D*V^T, WHERE D = S^(-1/2)
    //=========================================================
    // ! since C is symmetric positive semi-definite, its singular values S
    //   ARE its eigenvalues, so D = S^(-1/2) directly
    // ! remember that even though we may reduce dimensionality, we will store the whole thing anyway
    
    m_transf_matrix = std::vector<std::vector<double>>( m_ninp, std::vector<double>( m_ninp, 0.0 ) );
    
    // CALCULATE W = D*V^T = S^(-1/2)*V^T
    for(int i = 0; i < m_ninp; ++i)
    {
        for(int j = 0; j < m_ninp; ++j)
        {
            // NOTE(review): S[i] == 0 (a perfectly degenerate input dimension)
            // would divide by zero here -- assumed not to occur for real data
            m_transf_matrix[i][j] = sqrt(1.0/S[i])*VT[j*m_ninp + i];
        }
    }
    
    //=========================================================
    // STEP 4. DECIDE WHICH PRINCIPAL COMPONENTS TO KEEP
    //=========================================================
    // ! keep the smallest leading set of components whose cumulative
    //   singular-value mass reaches S_tol (S is returned sorted descending)
    
    double S_sum         = sum(S);
    double S_sum_partial = 0.0;
    
    // fallback: keep everything if the threshold is never reached
    // (e.g. S_tol > 1, or floating-point rounding at S_tol == 1.0);
    // previously m_transf_ninp was left stale in that case
    m_transf_ninp = m_ninp;
    
    for( int i = 0; i < m_ninp; ++i )
    {
        S_sum_partial += S[i];
        
        std::cout << "S[" << i << "]: " << S[i] << "   " << (S_sum_partial/S_sum) << std::endl;
        
        if( (S_sum_partial/S_sum) >= S_tol )
        {
            m_transf_ninp = (i+1);
            break;
        }
    }
    
    std::cout << "transf_ninp: " << m_transf_ninp << std::endl;
}

/*
//========================================================================
//========================================================================
//
// NAME: void CNEURAL_NET::datatransf_kernelLDA(CTRAINING_SET TS)
//
// DESC: Perform a kernel LDA on a training set
//
// INPUT:
//          TS                : training set
//
// NOTES:
//     ! the following algorithm is from the paper:
//          S. Mika et al. ``Fisher discriminant analysis with kernels'' in Neural Networks for Signal Processing IX, 1999. Proceedings of the 1999 IEEE Signal Processing Society Workshop.
//     ! !!! JMM: I think that below is correct, I modified the two class algorithm to be more general (BUT IS NOT COMPLETE YET FOR MULTICLASS PROBLEMS)
//
//========================================================================
//========================================================================
void CNEURAL_NET::datatransf_kernelLDA(CTRAINING_SET TS)
{
    //=========================================================
    // SETUP THE CLASSES
    //=========================================================
    
    // ERROR CHECK
    if( isclassif != true ) { std::cout << "Error in CNEURAL_NET::datatransf_kernelLDA(): NN is NOT setup for classification, yet a LDA transformation is requested!" << endl; }
    
    // SET THE NUMBER OF CLASSES
    int nC = nout;
    
    //=========================================================
    // SETUP THE LDA KERNEL
    //=========================================================
    
    // SET THE GAUSSIAN PARAMATER ... (!!! JMM: no idea what to set here)
    LDA_sigma = 0.01;
    // .. ALONG WITH THE KERNEL FUNCTION
    function<double(vector<double>, vector<double>)> kernel;
    kernel = bind(GaussianRBF_kernel, std::placeholders::_1, std::placeholders::_2, LDA_sigma);
    
    //=========================================================
    // SEPARATE THE TRAINING SET ACCORDING TO CLASS & ASSIGN LDA_vector
    //=========================================================
    // ! note that ninput, noutput, & npts are not assigned to the training sets here
    
    vector<vector<vector<double>>> LDA_vector_sep(nC);
    LDA_vector = vector<vector<double>>(TS.npts, vector<double>(ninp, 0.0) );
    
    for(int i = 0; i < TS.npts; ++i)
    {
        // GET THE CLASS OF THIS TRAINING POINT
        int num = -1;
        for(int j = 0; j < nC; ++j)
        {
            if( TS.output[i][j] == 1.0 )
            {
                num = j;
                break;
            }
        }
        
        if( num == -1 ) { std::cout << "Error in CNEURAL_NET::datatransf_kernelLDA(): position " << i << " in TS is not classified!" << endl; }
        
        // ASSIGN TMP INPUT
        vector<double> tmp_inp;
        tmp_inp = TS.input[i];
        
        // SHIFT INPUT TO A MEAN OF 0 & STD. DEV. OF 1
        for(int j = 0; j < ninp; ++j) { tmp_inp[j] = (tmp_inp[j] - inp_mean[j])/inp_stddev[j]; }
        
        // ASSIGN THIS TO THE SEPARATED
        LDA_vector_sep[num].push_back(tmp_inp);
        
        // ALSO ASSIGN THIS TO LDA_vector
        LDA_vector[i] = tmp_inp;
    }
    
    //=========================================================
    // PERFORM LDA
    //=========================================================
    
    kernelLDA(nC, LDA_vector, LDA_vector_sep, kernel, transf_matrix);
    
    //=========================================================
    // SET THE NUMBER OF TRANSFORMED INPUT
    //=========================================================
    // ! for a C-class classification problem, we have C-1 projections
    
    transf_ninp = nC - 1;
    
    
    //=========================================================
    // GET MEAN & VARIANCE OF TRANSFORMED INPUT SET
    //=========================================================
    // THE PROJECTION OF A NEW PATTERN X ONTO W IS GIVEN BY (W*Phi(X)) = sum_{i=1,l}[alpha_i*k(X_i,X)]
    // ! see Eq. (10) in Mueller
    // ! rememnber that alpha_i is stored in row 0 of the transformation matrix
    
    std::vector<double> x;
    
    for(int i = 0; i < transf_ninp; ++i)
    {
        for(int j = 0; j < TS.npts; ++j)
        {
            // SHIFT INPUT TO A MEAN OF 0 & STD. DEV. OF 1
            vector<double> tmp_inp;
            tmp_inp = TS.input[j];
            for(int k = 0; k < ninp; ++k) { tmp_inp[k] = (tmp_inp[k] - inp_mean[k])/inp_stddev[k]; }
            
            // NOW CALCULATE TRANSFORMED VALUE
            x.push_back(0.0);
            for(unsigned int k = 0; k < LDA_vector.size(); ++k) { x[j] += transf_matrix[i][k]*GaussianRBF_kernel( LDA_vector[k], tmp_inp, LDA_sigma); }
        }
        
        // !!! JMM: no [i] position in LDA_mean, etc.
        // GET MEAN & STD. DEV. OF LDA SET
        LDA_mean   = mean(x);
        LDA_stddev = stddev(x);
    }
}
*/
